/*
* Terrier - Terabyte Retriever
* Webpage: http://terrier.org
* Contact: terrier{a.}dcs.gla.ac.uk
* University of Glasgow - School of Computing Science
* http://www.gla.ac.uk/
*
* The contents of this file are subject to the Mozilla Public License
* Version 1.1 (the "License"); you may not use this file except in
* compliance with the License. You may obtain a copy of the License at
* http://www.mozilla.org/MPL/
*
* Software distributed under the License is distributed on an "AS IS"
* basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See
* the License for the specific language governing rights and limitations
* under the License.
*
* The Original Code is HadoopIndexing.java.
*
* The Original Code is Copyright (C) 2004-2011 the University of Glasgow.
* All Rights Reserved.
*
* Contributor(s):
* Richard McCreadie <richardm{a.}dcs.gla.ac.uk> (original author)
* Craig Macdonald <craigm{a.}dcs.gla.ac.uk>
*/
package org.terrier.applications;
import java.io.BufferedReader;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.compress.GzipCodec;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.JobID;
import org.apache.hadoop.mapred.RunningJob;
import org.apache.hadoop.mapred.TaskID;
import org.apache.hadoop.mapred.lib.HashPartitioner;
import org.apache.hadoop.mapred.lib.NullOutputFormat;
import org.apache.log4j.Logger;
import org.terrier.indexing.hadoop.Hadoop_BasicSinglePassIndexer;
import org.terrier.indexing.hadoop.Hadoop_BlockSinglePassIndexer;
import org.terrier.structures.BitIndexPointer;
import org.terrier.structures.BitPostingIndexInputStream;
import org.terrier.structures.FSOMapFileLexiconOutputStream;
import org.terrier.structures.FieldLexiconEntry;
import org.terrier.structures.Index;
import org.terrier.structures.IndexUtil;
import org.terrier.structures.LexiconEntry;
import org.terrier.structures.LexiconOutputStream;
import org.terrier.structures.indexing.LexiconBuilder;
import org.terrier.structures.indexing.singlepass.hadoop.MapEmittedPostingList;
import org.terrier.structures.indexing.singlepass.hadoop.MultiFileCollectionInputFormat;
import org.terrier.structures.indexing.singlepass.hadoop.SplitEmittedTerm;
import org.terrier.structures.seralization.FixedSizeWriteableFactory;
import org.terrier.utility.ApplicationSetup;
import org.terrier.utility.FieldScore;
import org.terrier.utility.Files;
import org.terrier.utility.io.HadoopPlugin;
import org.terrier.utility.io.HadoopUtility;
/**
* Main run class for the MapReduce indexing system.
* Provides facilities to perform indexing over multiple
* machines in a MapReduce cluster.
* <p><h3>Input</h3>
* The collection is assumed to be a list of files, as specified in the collection.spec. For more advanced collections,
* this class will need to be changed. The files listed in collection.spec are assumed to be on the Hadoop shared default
* filesystem - usually HDFS (else Hadoop will throw an error).
* </p>
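* <p>
* For example, a minimal collection.spec might contain (the paths are illustrative;
* lines beginning with # are treated as comments and skipped):
* <pre>
* # one collection file per line, on the shared filesystem
* hdfs://namenode/data/WT2G/B01.gz
* hdfs://namenode/data/WT2G/B02.gz
* </pre>
* </p>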
* <p><h3>Output</h3>
* This class creates indices for the indexed collection, in the directory specified by <tt>terrier.index.path</tt>. If this
* directory is NOT on the Hadoop shared default filesystem (usually HDFS), then Hadoop will throw an error.
* </p>
* <p>
* <h3>Reducers</h3>
* Two reduce modes are supported: <i>term-partitioning</i> creates
* a single index with multiple files making up the inverted structure; <i>document-partitioning</i>
* creates multiple indices, partitioned by docid. Using more reduce tasks increases indexing
* speed through greater concurrency.
* <p>
* Term-partitioning is the default scenario. In this scenario, at most 26 reducers
* are supported, as SplitEmittedTerm.SETPartitionerLowercaseAlphaTerm can use at most 26 partitions.
* To select document-partitioning, pass the -p flag, followed by the number of desired indices, to main().
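* <p>
* The argument forms accepted by main() are sketched below; how the class is launched
* on a cluster (e.g. via a hadoop jar invocation) depends on the deployment:
* <pre>
* HadoopIndexing            # term-partitioned indexing (default)
* HadoopIndexing -p 8       # document-partitioned indexing, creating 8 indices
* HadoopIndexing --merge    # merge the output of an earlier term-partitioned job
* </pre>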
* <p>
* <b>Properties:</b>
* <ul>
* <li><tt>terrier.hadoop.indexing.reducers</tt> - number of reduce tasks, defaults to 26.</li>
* <li>If <tt>block.indexing</tt> is set, then a block index will be created.</li>
* </ul>
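* <p>
* For example, in the terrier.properties file (the values shown are illustrative):
* <pre>
* terrier.hadoop.indexing.reducers=16
* block.indexing=true
* </pre>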
*
* @author Richard McCreadie and Craig Macdonald
* @since 2.2
*/
@SuppressWarnings("deprecation")
public class HadoopIndexing
{
static final int MAX_REDUCE = 26;
/** logger for this class */
protected static final Logger logger = Logger.getLogger(HadoopIndexing.class);
private static String usage()
{
return "Usage: HadoopIndexing [-p]";
}
/** Starts the MapReduce indexing.
* @param args command line arguments: none, "-p numberOfReducers", or "--merge"
* @throws Exception
*/
public static void main(String[] args) throws Exception {
long time = System.currentTimeMillis();
boolean docPartitioned = false;
int numberOfReducers = Integer.parseInt(ApplicationSetup.getProperty("terrier.hadoop.indexing.reducers", "26"));
final HadoopPlugin.JobFactory jf = HadoopPlugin.getJobFactory("HOD-TerrierIndexing");
if (args.length==2 && args[0].equals("-p"))
{
//logger.info("Document-partitioned Mode, "+numberOfReducers+" output indices.");
numberOfReducers = Integer.parseInt(args[1]);
docPartitioned = true;
}
else if (args.length == 1 && args[0].equals("--merge"))
{
if (numberOfReducers > 1)
mergeLexiconInvertedFiles(ApplicationSetup.TERRIER_INDEX_PATH, numberOfReducers);
else
logger.error("No point merging 1 reduce task output");
return;
}
else if (args.length == 0)
{
//logger.info("Term-partitioned Mode, "+numberOfReducers+" reducers creating one inverted index.");
docPartitioned = false;
if (numberOfReducers > MAX_REDUCE)
{
logger.warn("Excessive reduce tasks ("+numberOfReducers+") in use "
+"- SplitEmittedTerm.SETPartitionerLowercaseAlphaTerm can use "+MAX_REDUCE+" at most");
}
} else
{
logger.fatal(usage());
return;
}
if (jf == null)
throw new Exception("Could not get JobFactory from HadoopPlugin");
final JobConf conf = jf.newJob();
conf.setJobName("terrierIndexing");
if (Files.exists(ApplicationSetup.TERRIER_INDEX_PATH) && Index.existsIndex(ApplicationSetup.TERRIER_INDEX_PATH, ApplicationSetup.TERRIER_INDEX_PREFIX))
{
logger.fatal("Cannot index while index exists at "
+ApplicationSetup.TERRIER_INDEX_PATH+"," + ApplicationSetup.TERRIER_INDEX_PREFIX);
return;
}
boolean blockIndexing = ApplicationSetup.BLOCK_INDEXING;
if (blockIndexing)
{
conf.setMapperClass(Hadoop_BlockSinglePassIndexer.class);
conf.setReducerClass(Hadoop_BlockSinglePassIndexer.class);
}
else
{
conf.setMapperClass(Hadoop_BasicSinglePassIndexer.class);
conf.setReducerClass(Hadoop_BasicSinglePassIndexer.class);
}
FileOutputFormat.setOutputPath(conf, new Path(ApplicationSetup.TERRIER_INDEX_PATH));
conf.set("indexing.hadoop.prefix", ApplicationSetup.TERRIER_INDEX_PREFIX);
conf.setMapOutputKeyClass(SplitEmittedTerm.class);
conf.setMapOutputValueClass(MapEmittedPostingList.class);
conf.setBoolean("indexing.hadoop.multiple.indices", docPartitioned);
if (! conf.get("mapred.job.tracker").equals("local"))
{
conf.setMapOutputCompressorClass(GzipCodec.class);
conf.setCompressMapOutput(true);
}
else
{
conf.setCompressMapOutput(false);
}
conf.setInputFormat(MultiFileCollectionInputFormat.class);
conf.setOutputFormat(NullOutputFormat.class);
conf.setOutputKeyComparatorClass(SplitEmittedTerm.SETRawComparatorTermSplitFlush.class);
conf.setOutputValueGroupingComparator(SplitEmittedTerm.SETRawComparatorTerm.class);
conf.setReduceSpeculativeExecution(false);
//parse the collection.spec
BufferedReader specBR = Files.openFileReader(ApplicationSetup.COLLECTION_SPEC);
String line = null;
List<Path> paths = new ArrayList<Path>();
while((line = specBR.readLine()) != null)
{
if (line.startsWith("#"))
continue;
paths.add(new Path(line));
}
specBR.close();
FileInputFormat.setInputPaths(conf,paths.toArray(new Path[paths.size()]));
conf.setNumReduceTasks(numberOfReducers);
if (numberOfReducers> 1)
{
if (docPartitioned)
conf.setPartitionerClass(SplitEmittedTerm.SETPartitioner.class);
else
conf.setPartitionerClass(SplitEmittedTerm.SETPartitionerLowercaseAlphaTerm.class);
}
else
{
//for JUnit tests, we seem to need to restore the original partitioner class
conf.setPartitionerClass(HashPartitioner.class);
}
JobID jobId = null;
boolean ranOK = true;
try{
RunningJob rj = JobClient.runJob(conf);
jobId = rj.getID();
HadoopUtility.finishTerrierJob(conf);
} catch (Exception e) {
logger.error("Problem running job", e);
ranOK = false;
}
if (jobId != null)
{
deleteTaskFiles(ApplicationSetup.TERRIER_INDEX_PATH, jobId);
}
if (ranOK)
{
if (! docPartitioned)
{
if (numberOfReducers > 1)
mergeLexiconInvertedFiles(ApplicationSetup.TERRIER_INDEX_PATH, numberOfReducers);
}
Hadoop_BasicSinglePassIndexer.finish(
ApplicationSetup.TERRIER_INDEX_PATH,
docPartitioned ? numberOfReducers : 1,
jf);
}
System.out.println("Time Taken = "+((System.currentTimeMillis()-time)/1000)+" seconds");
jf.close();
}
/** For term-partitioned indexing, this method merges the lexicons from each reducer,
* and renames each reducer's inverted file to be one segment of the destination index.
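* <p>For example, to merge the output of 8 reduce tasks in place, as main() does after
* a term-partitioned job (the reducer count of 8 is illustrative):
* <pre>
* HadoopIndexing.mergeLexiconInvertedFiles(ApplicationSetup.TERRIER_INDEX_PATH, 8);
* </pre>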
* @param index_path path of the index
* @param numberOfReducers number of inverted files expected
* @throws IOException if an index structure cannot be read or written
*/
@SuppressWarnings("unchecked")
protected static void mergeLexiconInvertedFiles(String index_path, int numberOfReducers) throws IOException {
final String lexiconStructure = "lexicon";
final String tmpLexiconStructure = "newlex";
final String invertedStructure = "inverted";
//we're handling indices as streams, so we don't need to load them; but remember the previous status
//moreover, our indices don't have document objects, so errors may occur during preloading
final boolean indexProfile = Index.getIndexLoadingProfileAsRetrieval();
Index.setIndexLoadingProfileAsRetrieval(false);
//1. load in the input indices
final Index[] srcIndices = new Index[numberOfReducers];
final boolean[] existsIndices = new boolean[numberOfReducers];
Arrays.fill(existsIndices, true);
for(int i=0;i<numberOfReducers;i++)
{
final String index_prefix = ApplicationSetup.TERRIER_INDEX_PREFIX+"-"+i;
srcIndices[i] = Index.createIndex(index_path, index_prefix);
if (srcIndices[i] == null)
{
//remove any empty inverted file for this segment
Files.delete(BitPostingIndexInputStream.getFilename(index_path, index_prefix, invertedStructure, (byte)1, (byte)1));
//remember that this index doesnt exist
existsIndices[i] = false;
//logger.warn("No reduce "+i+" output : no output index ["+index_path+","+index_prefix+ "]");
}
}
//2. the target index is the first source index
Index dest = srcIndices[0] != null ? srcIndices[0] : Index.createIndex(index_path, ApplicationSetup.TERRIER_INDEX_PREFIX+"-"+0);
if (dest == null)
{
throw new IllegalArgumentException("No index found at " + index_path + ","+ ApplicationSetup.TERRIER_INDEX_PREFIX+"-"+0);
}
//3. create the new lexicon
LexiconOutputStream<String> lexOut = new FSOMapFileLexiconOutputStream(
dest, tmpLexiconStructure,
(FixedSizeWriteableFactory<Text>) dest.getIndexStructure(lexiconStructure + "-keyfactory"),
(Class<? extends FixedSizeWriteableFactory<LexiconEntry>>) dest.getIndexStructure(lexiconStructure + "-valuefactory").getClass());
//4. append each source lexicon on to the new lexicon, amending the filenumber as we go
int termId = 0;
for(int i=0;i<numberOfReducers;i++)
{
//this reduce partition produced no output
if (! existsIndices[i])
{
//touch an empty inverted index file for this segment, as BitPostingIndex requires that all of the files exist
Files.writeFileStream(BitPostingIndexInputStream.getFilename(
dest, invertedStructure, (byte)numberOfReducers, (byte)i)).close();
continue;
}
//else, append the lexicon
Iterator<Map.Entry<String,LexiconEntry>> lexIn = (Iterator<Map.Entry<String, LexiconEntry>>) srcIndices[i].getIndexStructureInputStream(lexiconStructure);
while(lexIn.hasNext())
{
Map.Entry<String,LexiconEntry> e = lexIn.next();
e.getValue().setTermId(termId);
((BitIndexPointer)e.getValue()).setFileNumber((byte)i);
lexOut.writeNextEntry(e.getKey(), e.getValue());
termId++;
}
IndexUtil.close(lexIn);
//rename the inverted file to be part of the destination index
Files.rename(
BitPostingIndexInputStream.getFilename(srcIndices[i], invertedStructure, (byte)1, (byte)1),
BitPostingIndexInputStream.getFilename(dest, invertedStructure, (byte)numberOfReducers, (byte)i));
}
lexOut.close();
//5. change over lexicon structures
final String[] structureSuffices = new String[]{"", "-entry-inputstream"};
//remove old lexicon structures
for (String suffix : structureSuffices)
{
if (! IndexUtil.deleteStructure(dest, lexiconStructure + suffix))
logger.warn("Structure " + lexiconStructure + suffix + " not found when removing");
}
//rename new lexicon structures
for (String suffix : structureSuffices)
{
if (! IndexUtil.renameIndexStructure(dest, tmpLexiconStructure + suffix, lexiconStructure + suffix))
logger.warn("Structure " + tmpLexiconStructure + suffix + " not found when renaming");
}
//6. update destination index
if (FieldScore.FIELDS_COUNT > 0)
dest.addIndexStructure("lexicon-valuefactory", FieldLexiconEntry.Factory.class.getName(), "java.lang.String", "${index.inverted.fields.count}");
dest.setIndexProperty("index."+invertedStructure+".data-files", ""+numberOfReducers);
LexiconBuilder.optimise(dest, lexiconStructure);
dest.flush();
//7. close source and dest indices
for(Index src: srcIndices) //dest is also closed
{
if (src != null)
src.close();
}
//8. rearrange indices into desired layout
//rename target index
IndexUtil.renameIndex(index_path, ApplicationSetup.TERRIER_INDEX_PREFIX+"-"+0, index_path, ApplicationSetup.TERRIER_INDEX_PREFIX);
//delete other source indices
for(int i=1;i<numberOfReducers;i++)
{
if (existsIndices[i])
IndexUtil.deleteIndex(index_path, ApplicationSetup.TERRIER_INDEX_PREFIX+"-"+i);
}
//restore loading profile
Index.setIndexLoadingProfileAsRetrieval(indexProfile);
}
/** Performs cleanup of an index path, removing temporary map side-effect files left by tasks of the given job
* @param path index path to clean
* @param job the JobID whose task side-effect files should be deleted
*/
public static void deleteTaskFiles(String path, JobID job)
{
String[] fileNames = Files.list(path);
if (fileNames == null)
return;
for(String filename : fileNames)
{
String[] periodParts = filename.split("\\.");
try{
TaskID tid = TaskID.forName(periodParts[0]);
if (tid.getJobID().equals(job))
{
if (! Files.delete(path + "/" + filename))
logger.warn("Could not delete temporary map side-effect file "+ path + "/" + filename);
}
} catch (Exception e) {
//filename does not begin with a parseable TaskID - not a task side-effect file, skip
}
}
}
}